import urllib.request
import re
import http.cookiejar
import os
from bs4 import BeautifulSoup
from dbutil import dbutil
from logutil import logutil

# Crawl entry point: every run starts from this front page.
mainSite='http://desk.zol.com.cn/'
# Directory downloaded pictures are written into.
folder='pics'
# Crawl log file name is nitu.log.
log=logutil('nitu.log')
# Generic URL validator: scheme, then domain / localhost / dotted IP,
# optional port, optional path or query.
urlPattern=re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, which was a
# check-then-act race and would crash if the directory appeared in between.
os.makedirs(folder, exist_ok=True)

def downloadOne(url):
    """
    Download one page: mark its link visited, analyse it for new links and
    image URLs, then save any still-unvisited pictures.

    url -- absolute URL of the page to fetch; URLs longer than 256
           characters are silently skipped (keeps link-table keys bounded).
    """
    if len(url)>256:
        return
    try:
        # NOTE(review): string-built SQL is injection-prone; fixing it
        # properly needs a parameterized API on dbutil.
        dbutil().update("update link set visited=1 where link='%s'" % url)
        log.log("%s begins..." % url)
        # Base directory of the page; ensure exactly one trailing '/' so
        # relative links can be joined onto it.  (The original appended '/'
        # unconditionally and then had a dead re-check.)
        linkRoot=os.path.split(url)[0]
        if not linkRoot.endswith('/'):
            linkRoot=linkRoot+'/'
        # Context manager closes the HTTP response (original leaked it).
        with urllib.request.urlopen(url) as f:
            analyse(f.read(),linkRoot)
        log.log("%s ends..." % url)

        log.log("%s savePics begins..." % url)
        savePics()
        log.log("%s savePics ends..." % url)

    # if you want it to stop, use Ctrl+C
    except KeyboardInterrupt:
        print('exit now')
        exit(0)
    except Exception as e:
        # Record failures in the log file as well, not only on stdout.
        print(e)
        log.log(str(e))

def analyse(file,dir):
    """
    Parse one page: insert page links (restricted to mainSite and matching
    urlPattern) into the `link` table and image URLs into the `picture`
    table, all with visited=0.

    file -- raw HTML of the page (bytes or str)
    dir  -- base URL ending with '/' used to absolutize relative links
    """
    # Explicit parser keeps results deterministic across environments
    # (BeautifulSoup otherwise picks whatever parser is installed).
    soup=BeautifulSoup(file,'html.parser')

    # Snapshot the table once instead of re-querying it for every anchor.
    knownLinks=set(dbutil().query('select link from link'))
    for link in soup.find_all('a'):
        insertLink=link.get('href')
        if not insertLink:
            continue
        if not insertLink.startswith('http://'):
            insertLink=dir+insertLink
        if insertLink.startswith(mainSite) and urlPattern.match(insertLink):
            if (insertLink,) not in knownLinks:
                knownLinks.add((insertLink,))
                # NOTE(review): string-built SQL; needs parameterized dbutil.
                dbutil().update("insert into link(link,visited) values('%s',0)" % insertLink)

    # BUG FIX: the original deduplicated pictures against the `link` table,
    # so every picture was re-inserted on each visit; check `picture`.
    knownPics=set(dbutil().query('select link from picture'))
    for link in soup.find_all('img'):
        insertLink=link.get('src')
        if not insertLink:
            continue
        if not insertLink.startswith('http://'):
            insertLink=dir+insertLink
        if (insertLink,) not in knownPics:
            knownPics.add((insertLink,))
            dbutil().update("insert into picture(link,visited) values('%s',0)" % insertLink)

def savePics():
    """
    Download every picture whose visited flag is 0 into the configured
    folder.  On success the row is set to visited=1, on failure visited=2;
    rows not yet attempted stay at visited=0.
    """
    piclinks=dbutil().query("select link from picture where visited=0")
    for piclink in piclinks:
        url=piclink[0]
        try:
            # Use the module `folder` constant (original hard-coded 'pics/')
            # and os.path.join for portable path construction.
            target=os.path.join(folder,os.path.split(url)[1])
            # Close both the HTTP response (original leaked it) and the file.
            with urllib.request.urlopen(url) as resp, open(target,'wb') as file:
                file.write(resp.read())
            dbutil().update("update picture set visited=1 where link='%s'" % url)
        except Exception as e:
            dbutil().update("update picture set visited=2 where link='%s'" % url)
            # Log a string, not the exception object.
            log.log(str(e))

def main():
    """Crawl from mainSite, then keep draining unvisited links until none remain."""
    # the whole program starts from mainSite, every time
    downloadOne(mainSite)
    while True:
        pending=dbutil().query("select link from link where visited=0")
        if not pending:
            break
        for row in pending:
            downloadOne(row[0])
        
if __name__=='__main__':
    main()
